From 1be1142d47bf7bfe16868be62805b7aedc866954 Mon Sep 17 00:00:00 2001
From: Daniel Sabo <DanielSabo@gmail.com>
Date: Sun, 20 Dec 2015 03:14:19 -0800
Subject: [PATCH] Add SSE4.1 u8 -> float conversions

---
 configure.ac           |  24 +++++
 extensions/Makefile.am |   3 +
 extensions/sse4-int8.c | 218 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 245 insertions(+)
 create mode 100644 extensions/sse4-int8.c

diff --git a/configure.ac b/configure.ac
index 66ebc77..f09c7ac 100644
--- a/configure.ac
+++ b/configure.ac
@@ -299,10 +299,15 @@ AC_ARG_ENABLE(sse2,
   [  --enable-sse2            enable SSE2 support (default=auto)],,
   enable_sse2=$enable_sse)
 
+AC_ARG_ENABLE(sse4_1,
+  [  --enable-sse4_1            enable SSE4_1 support (default=auto)],,
+  enable_sse4_1=$enable_sse)
+
 if test "x$enable_mmx" = xyes; then
   BABL_DETECT_CFLAGS(MMX_EXTRA_CFLAGS, '-mmmx')
   SSE_EXTRA_CFLAGS=
   SSE2_EXTRA_CFLAGS=
+  SSE4_1_EXTRA_CFLAGS=
 
   AC_MSG_CHECKING(whether we can compile MMX code)
 
@@ -353,6 +358,24 @@ if test "x$enable_mmx" = xyes; then
           AC_MSG_RESULT(no)
           AC_MSG_WARN([The assembler does not support the SSE2 command set.])
         )
+
+        if test "x$enable_sse4_1" = xyes; then
+          BABL_DETECT_CFLAGS(sse4_1_flag, '-msse4.1')
+          SSE4_1_EXTRA_CFLAGS="$SSE_EXTRA_CFLAGS $sse4_1_flag"
+
+          AC_MSG_CHECKING(whether we can compile SSE4_1 code)
+
+          CFLAGS="$CFLAGS $sse4_1_flag"
+
+          AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("pmovzxbd %xmm0,%xmm1");])],
+            AC_DEFINE(USE_SSE4_1, 1, [Define to 1 if SSE4_1 assembly is available.])
+            AC_MSG_RESULT(yes)
+          ,
+            enable_sse4_1=no
+            AC_MSG_RESULT(no)
+            AC_MSG_WARN([The assembler does not support the SSE4_1 command set.])
+          )
+        fi
       fi
 
     fi
@@ -367,6 +390,7 @@ if test "x$enable_mmx" = xyes; then
   AC_SUBST(MMX_EXTRA_CFLAGS)
   AC_SUBST(SSE_EXTRA_CFLAGS)
   AC_SUBST(SSE2_EXTRA_CFLAGS)
+  AC_SUBST(SSE4_1_EXTRA_CFLAGS)
 fi
 
 
diff --git a/extensions/Makefile.am b/extensions/Makefile.am
index 4a3fb8a..cd7e893 100644
--- a/extensions/Makefile.am
+++ b/extensions/Makefile.am
@@ -31,6 +31,7 @@ ext_LTLIBRARIES = \
 	sse2-float.la   \
 	sse2-int8.la    \
 	sse2-int16.la   \
+	sse4-int8.la    \
 	two-table.la	\
 	ycbcr.la
 
@@ -48,6 +49,7 @@ HSV_la_SOURCES = HSV.c
 sse2_float_la_SOURCES = sse2-float.c
 sse2_int8_la_SOURCES = sse2-int8.c
 sse2_int16_la_SOURCES = sse2-int16.c
+sse4_int8_la_SOURCES = sse4-int8.c
 two_table_la_SOURCES = two-table.c two-table-tables.h
 ycbcr_la_SOURCES = ycbcr.c
 float_la_SOURCES = float.c
@@ -59,3 +61,4 @@ LIBS = $(top_builddir)/babl/libbabl-@BABL_API_VERSION@.la $(MATH_LIB) \
 sse2_float_la_CFLAGS = $(SSE2_EXTRA_CFLAGS)
 sse2_int8_la_CFLAGS = $(SSE2_EXTRA_CFLAGS)
 sse2_int16_la_CFLAGS = $(SSE2_EXTRA_CFLAGS)
+sse4_int8_la_CFLAGS = $(SSE4_1_EXTRA_CFLAGS)
diff --git a/extensions/sse4-int8.c b/extensions/sse4-int8.c
new file mode 100644
index 0000000..73f63e3
--- /dev/null
+++ b/extensions/sse4-int8.c
@@ -0,0 +1,218 @@
+/* babl - dynamically extendable universal pixel conversion library.
+ * Copyright (C) 2013 Daniel Sabo
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General
+ * Public License along with this library; if not, see
+ * <http://www.gnu.org/licenses/>.
+ */
+
+#include "config.h"
+
+#if defined(USE_SSE4_1)
+
+/* SSE4.1 intrinsics */
+#include <smmintrin.h>
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "babl.h"
+#include "babl-cpuaccel.h"
+#include "extensions/util.h"
+
+/* Convert `samples` u8 values at src to floats in [0, 1] at dst (x / 255).
+ * Four values per iteration via SSE4.1 PMOVZXBD; returns `samples`. */
+static inline long
+conv_y8_yF (const uint8_t *src, float *dst, long samples)
+{
+  const float  factor     = 1.0f / 255.0f;
+  const __m128 factor_vec = _mm_set1_ps (factor);
+  long         n          = samples;
+
+  while (n >= 4)
+    {
+      /* Assemble the four source bytes little-endian (x86 order) instead
+       * of dereferencing a uint32_t pointer: src may be unaligned, and a
+       * cast-and-load would also break the strict-aliasing rules. */
+      const uint32_t packed = (uint32_t) src[0]         |
+                              ((uint32_t) src[1] << 8)  |
+                              ((uint32_t) src[2] << 16) |
+                              ((uint32_t) src[3] << 24);
+      /* MOVD zero-fills the upper lanes, so every bit of the register is
+       * defined before PMOVZXBD widens the low four bytes to 32 bits. */
+      const __m128i bytes = _mm_cvtsi32_si128 ((int) packed);
+      const __m128i lanes = _mm_cvtepu8_epi32 (bytes);
+
+      _mm_storeu_ps (dst, _mm_mul_ps (_mm_cvtepi32_ps (lanes), factor_vec));
+      src += 4;
+      dst += 4;
+      n   -= 4;
+    }
+
+  /* Scalar tail for the last 0-3 values; n > 0 also stops on samples < 0. */
+  for (; n > 0; n--)
+    *dst++ = (float) (*src++) * factor;
+
+  return samples;
+}
+
+static long
+conv_ya8_yaF (const uint8_t *src, float *dst, long samples)
+{
+  return conv_y8_yF (src, dst, samples * 2) / 2; /* 2 u8 values per pixel: Y, A */
+}
+
+static long
+conv_rgb8_rgbF (const uint8_t *src, float *dst, long samples)
+{
+  return conv_y8_yF (src, dst, samples * 3) / 3; /* 3 u8 values per pixel: R, G, B */
+}
+
+static long
+conv_rgba8_rgbaF (const uint8_t *src, float *dst, long samples)
+{
+  return conv_y8_yF (src, dst, samples * 4) / 4; /* 4 u8 values per pixel: R, G, B, A */
+}
+
+#endif
+
+int init (void);
+/* Registers the u8 -> float conversions if the CPU reports SSE4.1; returns 0. */
+int
+init (void)
+{
+#if defined(USE_SSE4_1)
+  const Babl *rgbaF_linear = babl_format_new (
+    babl_model ("RGBA"),
+    babl_type ("float"),
+    babl_component ("R"),
+    babl_component ("G"),
+    babl_component ("B"),
+    babl_component ("A"),
+    NULL);
+  const Babl *rgba8_linear = babl_format_new (
+    babl_model ("RGBA"),
+    babl_type ("u8"),
+    babl_component ("R"),
+    babl_component ("G"),
+    babl_component ("B"),
+    babl_component ("A"),
+    NULL);
+  const Babl *rgbaF_gamma = babl_format_new (
+    babl_model ("R'G'B'A"),
+    babl_type ("float"),
+    babl_component ("R'"),
+    babl_component ("G'"),
+    babl_component ("B'"),
+    babl_component ("A"),
+    NULL);
+  const Babl *rgba8_gamma = babl_format_new (
+    babl_model ("R'G'B'A"),
+    babl_type ("u8"),
+    babl_component ("R'"),
+    babl_component ("G'"),
+    babl_component ("B'"),
+    babl_component ("A"),
+    NULL);
+  const Babl *rgbF_linear = babl_format_new (
+    babl_model ("RGB"),
+    babl_type ("float"),
+    babl_component ("R"),
+    babl_component ("G"),
+    babl_component ("B"),
+    NULL);
+  const Babl *rgb8_linear = babl_format_new (
+    babl_model ("RGB"),
+    babl_type ("u8"),
+    babl_component ("R"),
+    babl_component ("G"),
+    babl_component ("B"),
+    NULL);
+  const Babl *rgbF_gamma = babl_format_new (
+    babl_model ("R'G'B'"),
+    babl_type ("float"),
+    babl_component ("R'"),
+    babl_component ("G'"),
+    babl_component ("B'"),
+    NULL);
+  const Babl *rgb8_gamma = babl_format_new (
+    babl_model ("R'G'B'"),
+    babl_type ("u8"),
+    babl_component ("R'"),
+    babl_component ("G'"),
+    babl_component ("B'"),
+    NULL);
+  const Babl *yaF_linear = babl_format_new (
+    babl_model ("YA"),
+    babl_type ("float"),
+    babl_component ("Y"),
+    babl_component ("A"),
+    NULL);
+  const Babl *ya8_linear = babl_format_new (
+    babl_model ("YA"),
+    babl_type ("u8"),
+    babl_component ("Y"),
+    babl_component ("A"),
+    NULL);
+  const Babl *yaF_gamma = babl_format_new (
+    babl_model ("Y'A"),
+    babl_type ("float"),
+    babl_component ("Y'"),
+    babl_component ("A"),
+    NULL);
+  const Babl *ya8_gamma = babl_format_new (
+    babl_model ("Y'A"),
+    babl_type ("u8"),
+    babl_component ("Y'"),
+    babl_component ("A"),
+    NULL);
+  const Babl *yF_linear = babl_format_new (
+    babl_model ("Y"),
+    babl_type ("float"),
+    babl_component ("Y"),
+    NULL);
+  const Babl *y8_linear = babl_format_new (
+    babl_model ("Y"),
+    babl_type ("u8"),
+    babl_component ("Y"),
+    NULL);
+  const Babl *yF_gamma = babl_format_new (
+    babl_model ("Y'"),
+    babl_type ("float"),
+    babl_component ("Y'"),
+    NULL);
+  const Babl *y8_gamma = babl_format_new (
+    babl_model ("Y'"),
+    babl_type ("u8"),
+    babl_component ("Y'"),
+    NULL);
+  /* CONV registers one routine for both the linear and the gamma format pair. */
+#define CONV(src, dst) \
+  do { \
+    babl_conversion_new (src ## _linear, dst ## _linear, "linear", conv_ ## src ## _ ## dst, NULL); \
+    babl_conversion_new (src ## _gamma, dst ## _gamma, "linear", conv_ ## src ## _ ## dst, NULL); \
+  } while (0)
+
+  if ((babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_SSE4_1))
+    {
+      CONV(rgba8, rgbaF);
+      CONV(rgb8,  rgbF);
+      CONV(ya8,   yaF);
+      CONV(y8,    yF);
+    }
+
+#endif
+
+  return 0;
+}
+
-- 
2.30.2